In [1]:

    
%matplotlib inline

Experimental Model Building

Code for building the models
Author: Jimmy Charité
Email: jimmy.charite@gmail.com

Experimenting with TensorFlow



In [2]:

    
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML
import tensorflow as tf



In [3]:

    
# Step up one directory so the relative paths used later ('./clean_data/...',
# './other_output/...', './plots/...') resolve -- presumably to the project root.
# os.chdir() returns None, so retval is always None.
# NOTE(review): re-running this cell climbs one more level each time.
retval = os.chdir("..")



In [4]:

    
# Load the cleaned dataset from its pickle file (path is relative to the
# current working directory). Only unpickle files from a trusted source.
clean_data = pd.read_pickle('./clean_data/clean_data.pkl')



In [5]:

    
# Preview the first rows to check the column layout before selecting features.
clean_data.head()









    Out[5]:







  
    
      
      helpful
      num_sents
      num_words
      readability
      neg_senti
      pos_senti
      neu_senti
      comp_senti
      text_lemma
      vec0
      ...
      vec290
      vec291
      vec292
      vec293
      vec294
      vec295
      vec296
      vec297
      vec298
      vec299
    
  
  
    
      0
      0.0
      0.693147
      3.610918
      6.742881
      0.079
      0.068
      0.853
      -0.1027
      product arrive label peanut actually small siz...
      0.033346
      ...
      -0.023125
      -0.005069
      0.007344
      -0.045929
      -0.017832
      -0.018206
      -0.017281
      0.012410
      0.020198
      -0.002511
    
    
      1
      0.0
      1.386294
      3.555348
      6.734948
      0.000
      0.448
      0.552
      0.9468
      great taffy great price wide assortment yummy ...
      0.037825
      ...
      -0.015524
      0.009058
      0.020853
      -0.058746
      -0.001076
      -0.013715
      -0.035464
      0.006317
      0.023066
      0.012566
    
    
      2
      0.0
      1.609438
      4.499810
      6.743588
      0.029
      0.163
      0.809
      0.8830
      get wild hair taffy order pound bag taffy enjo...
      0.039023
      ...
      -0.011637
      0.008717
      0.007918
      -0.046595
      -0.012542
      -0.028316
      -0.036677
      0.015261
      0.016227
      0.008930
    
    
      3
      0.0
      1.609438
      4.143135
      6.742527
      0.034
      0.273
      0.693
      0.9346
      saltwater taffy great flavor soft chewy candy ...
      0.038912
      ...
      -0.010440
      0.006156
      0.007695
      -0.039642
      -0.012080
      -0.026868
      -0.018743
      0.009134
      0.021543
      0.016047
    
    
      4
      0.0
      1.609438
      3.526361
      6.737915
      0.000
      0.480
      0.520
      0.9487
      taffy good soft chewy flavor amazing definitel...
      0.043776
      ...
      -0.010004
      -0.003239
      0.014308
      -0.050601
      -0.024100
      -0.023046
      -0.017151
      0.017009
      0.010729
      0.004194
    
  

5 rows × 309 columns



In [6]:

    
# Target column first, then every column from position 9 onward
# (vec0 ... vec299 in the preview above).
kept_cols = ['helpful'] + list(clean_data.columns[9:])

Training and Testing Split



In [7]:

    
# Settings for the train/test split below.
my_rand_state = 0   # passed as random_state so the split is repeatable
test_size = 0.25    # passed as test_size: share of rows held out for testing



In [8]:

    
from sklearn.model_selection import train_test_split



In [9]:

    
# Split the kept columns into a feature matrix and a label list.
# kept_cols starts with 'helpful', so column 0 is the target and the
# remaining columns are the features.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same NumPy array and works on old and new versions.
X = clean_data[kept_cols].iloc[:, 1:].values
y = clean_data[kept_cols].iloc[:, 0].tolist()



In [10]:

    
# Hold out a test_size share of the rows for evaluation; the fixed
# random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=my_rand_state
)

Setting Up TensorFlow



In [11]:

    
# A single real-valued feature column whose dimension is the number of
# columns in X. X.shape[1] reads the column count directly instead of
# slicing the first row, so it does not fail when X has no rows.
feature_columns = [
    tf.contrib.layers.real_valued_column("", dimension=X.shape[1])
]



In [12]:

    
# Deep neural-network classifier with hidden layers of 200, 100 and 50 units.
# model_dir is the directory handed to TensorFlow for this estimator's files
# (it appears as '_model_dir' in the config logged below).
dnn_clf = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[200, 100, 50],
    model_dir='./other_output/tf_model'
)









    



INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_task_type': None, '_num_worker_replicas': 0, '_tf_random_seed': None, '_is_chief': True, '_task_id': 0, '_master': '', '_keep_checkpoint_max': 5, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fde66fe6ac8>, '_environment': 'local', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_model_dir': './other_output/tf_model', '_save_summary_steps': 100, '_evaluation_master': '', '_session_config': None}



In [13]:

    
from sklearn.preprocessing import StandardScaler
# Notebook-level scaler instance; PassData.scale() fits and applies it.
std_scale = StandardScaler()



In [16]:

    
class PassData(object):
    '''
    Callable object that holds a feature matrix and its labels and
    passes them to tensorflow as constant tensors.

    Calling an instance returns a (features, labels) pair, so the
    instance itself can be given as an estimator's input_fn.
    '''

    def __init__(self, X, y):
        '''
        Store the data to serve.

        X -- feature matrix
        y -- labels belonging to the rows of X
        '''
        self.X = X
        self.y = y

    def scale(self):
        '''
        Replace self.X with a scaled copy.

        Fits the notebook-level std_scale object on self.X (self.y is
        forwarded to fit_transform) and stores the transformed result.
        '''
        self.X = std_scale.fit_transform(self.X, self.y)

    def __call__(self):
        '''
        Return (tf.constant(self.X), tf.constant(self.y)).
        '''
        # Read from the instance rather than the notebook-level X and y:
        # the globals ignore what was passed to __init__ and are never
        # touched by scale(), so using them would drop the scaling.
        return tf.constant(self.X), tf.constant(self.y)



In [17]:

    
# Wrap only the training split: the evaluation further down scores on
# X_test / y_test, so those rows must not be part of the training data.
train_data = PassData(X_train, y_train)



In [ ]:

    
# Scale the stored features (replaces train_data.X with the scaled values).
train_data.scale()



In [ ]:

    
# Train for 1000 steps; train_data is callable and supplies the
# (features, labels) tensors when used as input_fn.
dnn_clf.fit(input_fn=train_data, steps=1000)









    



WARNING:tensorflow:From /home/jim/anaconda2/envs/py35/lib/python3.5/site-packages/tensorflow/contrib/learn/python/learn/estimators/head.py:615: scalar_summary (from tensorflow.python.ops.logging_ops) is deprecated and will be removed after 2016-11-30.
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer supported.
INFO:tensorflow:Create CheckpointSaverHook.

Testing Estimators



In [ ]:

    
from sklearn.metrics import roc_curve, auc



In [ ]:

    
def _roc_summary(estimator):
    '''
    Return (fpr, tpr, roc_auc) for `estimator` on the held-out split.

    Scores X_test with estimator.predict_proba, takes column 1 as the
    score passed to roc_curve together with y_test, and computes the
    area under the resulting curve.
    '''
    fpr, tpr, _ = roc_curve(y_test, estimator.predict_proba(X_test)[:, 1])
    return fpr, tpr, auc(fpr, tpr)


# NOTE(review): nb_clf_est_b, qda_clf_est_b, log_clf_est_b and rf_clf_est_b
# are not defined in the visible part of this notebook; on a fresh kernel
# these lines would raise NameError. Confirm where they come from.
nb_fpr, nb_tpr, nb_roc_auc = _roc_summary(nb_clf_est_b)
qda_fpr, qda_tpr, qda_roc_auc = _roc_summary(qda_clf_est_b)
log_fpr, log_tpr, log_roc_auc = _roc_summary(log_clf_est_b)
rf_fpr, rf_tpr, rf_roc_auc = _roc_summary(rf_clf_est_b)



In [ ]:

    
# One entry per model:
# (false-positive rates, true-positive rates, AUC, line colour, legend template)
roc_series = [
    (nb_fpr, nb_tpr, nb_roc_auc, 'cyan', 'NB (area = %0.2f)'),
    (qda_fpr, qda_tpr, qda_roc_auc, 'indigo', 'QDA (area = %0.2f)'),
    (log_fpr, log_tpr, log_roc_auc, 'seagreen', 'LOG (area = %0.2f)'),
    (rf_fpr, rf_tpr, rf_roc_auc, 'blue', 'RF (area = %0.2f)'),
]

# Draw the model curves in the order listed above.
for fpr, tpr, area, colour, label_fmt in roc_series:
    plt.plot(fpr, tpr, color=colour, linestyle='--',
             label=label_fmt % area, lw=2)

# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
         label='Luck')

# Small margin around the unit square so curves on the border stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# NOTE(review): the title and file name refer to basic BOW models --
# confirm they are the intended ones for this notebook.
plt.title('ROC Curves of Basic Models Using BOW & Macro-Text Stats')
plt.legend(loc="lower right")
# Save before show() so the written file contains the drawn figure.
plt.savefig('./plots/ROC_Basic_BOW_MERGED.png', bbox_inches='tight')
plt.show()

	num_sents	num_words	readability	neg_senti	pos_senti	neu_senti	comp_senti	text_lemma	vec0	...	vec290	vec291	vec292	vec293	vec294	vec295	vec296	vec297	vec298	vec299
0	0.693147	3.610918	6.742881	0.079	0.068	0.853	-0.1027	product arrive label peanut actually small siz...	0.033346	...	-0.023125	-0.005069	0.007344	-0.045929	-0.017832	-0.018206	-0.017281	0.012410	0.020198	-0.002511
1	1.386294	3.555348	6.734948	0.000	0.448	0.552	0.9468	great taffy great price wide assortment yummy ...	0.037825	...	-0.015524	0.009058	0.020853	-0.058746	-0.001076	-0.013715	-0.035464	0.006317	0.023066	0.012566
2	1.609438	4.499810	6.743588	0.029	0.163	0.809	0.8830	get wild hair taffy order pound bag taffy enjo...	0.039023	...	-0.011637	0.008717	0.007918	-0.046595	-0.012542	-0.028316	-0.036677	0.015261	0.016227	0.008930
3	1.609438	4.143135	6.742527	0.034	0.273	0.693	0.9346	saltwater taffy great flavor soft chewy candy ...	0.038912	...	-0.010440	0.006156	0.007695	-0.039642	-0.012080	-0.026868	-0.018743	0.009134	0.021543	0.016047
4	1.609438	3.526361	6.737915	0.000	0.480	0.520	0.9487	taffy good soft chewy flavor amazing definitel...	0.043776	...	-0.010004	-0.003239	0.014308	-0.050601	-0.024100	-0.023046	-0.017151	0.017009	0.010729	0.004194